Loading required libraries

library(Hmisc)
library(caTools)
library(randomForest)
library(ggplot2)
library(plotly)
library(e1071)
library(ROCR)
library(pROC)

Reading and summarization of data

# Load the match results and keep only matches played from 2008 onwards.
data <- read.csv("results.csv")
# Compare Year with a number: comparing against the string "2008" would
# coerce every year to text and fall back to a lexicographic comparison.
data <- data[data$Year >= 2008, ]
summary(data)
##        date             home_team         away_team      home_score    
##  2/29/12 :  66   Mexico      : 112   Costa Rica:  90   Min.   : 0.000  
##  3/29/16 :  63   Qatar       : 110   Zambia    :  89   1st Qu.: 0.000  
##  3/26/08 :  60   Japan       : 106   Korea DPR :  83   Median : 1.000  
##  3/5/14  :  59   USA         : 106   Cameroon  :  80   Mean   : 1.585  
##  11/14/12:  56   South Africa: 102   Iraq      :  79   3rd Qu.: 2.000  
##  10/11/11:  54   Oman        :  97   Syria     :  79   Max.   :17.000  
##  (Other) :9593   (Other)     :9318   (Other)   :9451                   
##    away_score                                    tournament  
##  Min.   : 0.000   Friendly                            :3736  
##  1st Qu.: 0.000   FIFA World Cup qualification        :2314  
##  Median : 1.000   UEFA Euro qualification             : 516  
##  Mean   : 1.095   African Cup of Nations qualification: 453  
##  3rd Qu.: 2.000   AFC Asian Cup qualification         : 201  
##  Max.   :20.000   African Cup of Nations              : 189  
##                   (Other)                             :2542  
##             city              country      neutral             Year     
##  Doha         : 166   USA         : 374   Mode :logical   Min.   :2008  
##  Dar es Salaam:  84   South Africa: 320   FALSE:7042      1st Qu.:2010  
##  London       :  84   France      : 235   TRUE :2909      Median :2013  
##  Muscat       :  83   Qatar       : 181                   Mean   :2013  
##  Amman        :  79   England     : 170                   3rd Qu.:2015  
##  Kampala      :  79   Sweden      : 161                   Max.   :2018  
##  (Other)      :9376   (Other)     :8510                                 
##      Month       
##  Min.   : 1.000  
##  1st Qu.: 5.000  
##  Median : 7.000  
##  Mean   : 6.956  
##  3rd Qu.:10.000  
##  Max.   :12.000  
## 

How Data Looks:

# Preview the first six rows of the filtered match data.
head(data, n = 6L)
##          date home_team away_team home_score away_score tournament
## 29719  1/2/08    Kuwait   Lebanon          3          2   Friendly
## 29720  1/5/08     Egypt   Namibia          3          0   Friendly
## 29721  1/6/08   Tunisia    Zambia          1          2   Friendly
## 29722  1/8/08   Tunisia    Zambia          1          0   Friendly
## 29723  1/9/08   Nigeria     Sudan          2          0   Friendly
## 29724 1/10/08     Egypt      Mali          1          0   Friendly
##            city              country neutral Year Month
## 29719   Salmiya               Kuwait   FALSE 2008     1
## 29720     Aswan                Egypt   FALSE 2008     1
## 29721     Radès              Tunisia   FALSE 2008     1
## 29722     Radès              Tunisia   FALSE 2008     1
## 29723  Estepona                Spain    TRUE 2008     1
## 29724 Abu Dhabi United Arab Emirates    TRUE 2008     1
#Checking Missing Values

# Count missing values per column. Work column by column instead of using
# apply(), which would first coerce the whole data frame to a character matrix.
missing_count <- vapply(data, function(column) sum(is.na(column)), integer(1))
missing <- data.frame(
  Missing_Count = missing_count,
  row.names = names(missing_count)
)
# Share of rows with a missing value, as a whole-number percentage.
missing$Missing_Per <- round(missing$Missing_Count / nrow(data) * 100, 0)
missing
##            Missing_Count Missing_Per
## date                   0           0
## home_team              0           0
## away_team              0           0
## home_score             0           0
## away_score             0           0
## tournament             0           0
## city                   0           0
## country                0           0
## neutral                0           0
## Year                   0           0
## Month                  0           0

Exploratory Analysis

Top 10 Match Hosts

# Top ten match hosts 

# Tabulate matches per host country, order by count, and keep the ten largest.
host <- as.data.frame(tail(sort(table(data$country)), 10))
colnames(host) <- c("Country", "No_of_Matches")

# Bar chart of the ten busiest host countries.
graph_1 <- ggplot(host, aes(x = Country, y = No_of_Matches)) +
  geom_col(width = 0.75, fill = "Red") +
  labs(
    title = "Top 10 Host",
    subtitle = "2008-2018",
    caption = "source: Kaggle"
  )
graph_1

Home Advantage?

# A match counts as "played at home" when the host country is the home team.
played_at_home <- as.character(data$country) == as.character(data$home_team)

# Flag decisive results only: a draw is neither a home win nor an away win,
# so both comparisons are strict.
home_win <- played_at_home & (data$home_score > data$away_score)
away_win <- played_at_home & (data$home_score < data$away_score)

# Frequency tables (FALSE/TRUE counts) used as plotting data.
home_win <- as.data.frame(table(home_win))
away_win <- as.data.frame(table(away_win))

# Same colour pair for both charts: grey for FALSE, red for TRUE.
bar_colours <- c("rgba(204,204,204,1)", "rgba(222,45,38,0.8)")

graph_2_a <- plot_ly(home_win, x = ~home_win, y = ~Freq, type = "bar",
                     marker = list(color = bar_colours)) %>%
  layout(title = "Home Wins",
         yaxis = list(title = "# of Matches"))

graph_2_b <- plot_ly(away_win, x = ~away_win, y = ~Freq, type = "bar",
                     marker = list(color = bar_colours)) %>%
  layout(title = "Away Wins",
         yaxis = list(title = "# of Matches"))
graph_2_a
graph_2_b

BEST TEAMS OF ALL TIME

# Winner of every match, computed in one vectorised pass instead of growing a
# vector row by row (which also left a spurious "0" entry in the tally).
best_team <- with(
  data,
  ifelse(home_score > away_score,
         as.character(home_team),
         as.character(away_team))
)
# Draws have no winner, so they must not be credited to either side.
best_team <- best_team[data$home_score != data$away_score]

# Ten teams with the most wins; the columns are named best_team and Freq.
best_team <- as.data.frame(tail(sort(table(best_team)), 10))

# The chart counts wins by home and away sides alike, hence the title.
graph_3 <- ggplot(best_team, aes(x = best_team, y = Freq)) +
  geom_bar(stat = "identity", width = .75, fill = "Blue") +
  coord_flip() +
  labs(title = "Top 10 Teams by Wins",
       subtitle = "2008-2018",
       caption = "source: Kaggle")
graph_3

Top 10 TOURNAMENTS

# Ten tournaments with the most matches, in ascending order of match count.
tournament <- as.data.frame(tail(sort(table(data$tournament)), 10))
colnames(tournament) <- c("Tournament", "No_of_Matches")

# Interactive bar chart of matches per tournament.
graph_4 <- plot_ly(
  tournament,
  x = ~Tournament,
  y = ~No_of_Matches,
  type = "bar"
) %>%
  layout(title = "Top Tournaments")
graph_4

Brazil trend of playing matches over the years

# Matches per year for one team, whether it played at home or away.
team <- "Brazil"
temp <- data.frame(data$home_team, data$away_team, data$Year)
colnames(temp) <- c("home_team", "away_team", "year")
temp <- temp[temp$home_team == team | temp$away_team == team, ]

# Keep every year in chronological order. Sorting by frequency and taking the
# top ten would silently drop the quietest years from the trend line.
team_trend <- as.data.frame(table(temp$year))
colnames(team_trend) <- c("year", "freq")
team_trend$year <- as.character(team_trend$year)

# group = 1 joins the points into a single line across the discrete year axis.
graph_5 <- ggplot(data = team_trend, aes(x = year, y = freq, group = 1)) +
  geom_line(colour = "blue", linetype = "dashed", size = 1.5) +
  geom_point(colour = "blue", size = 4, shape = 21, fill = "white")
graph_5

Reading data for top Teams, Stadiums and Tournaments

# Reload the full match history (strings kept as character) together with the
# lookup tables used below for binning teams, host countries and tournaments.
data <- read.csv(file = "results.csv", stringsAsFactors = FALSE)
home <- read.csv(file = "best_teams.csv")
country <- read.csv(file = "country.csv")
tournament <- read.csv(file = "tournament.csv")

Feature Engineering

Creating a winner variable

# Label each match by its outcome from the home side's point of view.
# Vectorised: the whole column is built in one pass, and an empty data frame
# is handled without the 1:length() pitfall.
data$Winner <- ifelse(
  data$home_score > data$away_score,
  "Home",
  ifelse(data$home_score < data$away_score, "Away", "Tie")
)
head(data)
##         date home_team away_team home_score away_score tournament    city
## 1 1872-11-30  Scotland   England          0          0   Friendly Glasgow
## 2 1873-03-08   England  Scotland          4          2   Friendly  London
## 3 1874-03-07  Scotland   England          2          1   Friendly Glasgow
## 4 1875-03-06   England  Scotland          2          2   Friendly  London
## 5 1876-03-04  Scotland   England          3          0   Friendly Glasgow
## 6 1876-03-25  Scotland     Wales          4          0   Friendly Glasgow
##    country neutral Year Month Winner
## 1 Scotland   FALSE 1872    11    Tie
## 2  England   FALSE 1873     3   Home
## 3 Scotland   FALSE 1874     3   Home
## 4  England   FALSE 1875     3    Tie
## 5 Scotland   FALSE 1876     3   Home
## 6 Scotland   FALSE 1876     3   Home

Binning of Home and Away Team

# Collapse every team that is not in the reference list (home$Teams) into a
# single "others" level. Both columns are binned against the same list.
# Vectorised masks replace the row-by-row loops.
data$home_team[!(data$home_team %in% home$Teams)] <- "others"
data$away_team[!(data$away_team %in% home$Teams)] <- "others"
unique(data$home_team)
##  [1] "Scotland"            "England"             "others"             
##  [4] "USA"                 "Uruguay"             "Austria"            
##  [7] "Hungary"             "Argentina"           "Belgium"            
## [10] "France"              "Netherlands"         "Switzerland"        
## [13] "Sweden"              "Germany"             "Italy"              
## [16] "Norway"              "Russia"              "Denmark"            
## [19] "Brazil"              "Japan"               "Paraguay"           
## [22] "Spain"               "Poland"              "Yugoslavia"         
## [25] "Romania"             "Portugal"            "China"              
## [28] "Australia"           "Turkey"              "Mexico"             
## [31] "Egypt"               "Bulgaria"            "Kenya"              
## [34] "Uganda"              "Ireland"             "Trinidad and Tobago"
## [37] "Zimbabwe"            "Zambia"              "Iran"               
## [40] "Korea Republic"      "Ghana"               "Nigeria"            
## [43] "Indonesia"           "Tunisia"             "Malawi"             
## [46] "Morocco"             "Ivory Coast"         "Iraq"               
## [49] "Thailand"            "Senegal"             "Algeria"

Binning of Tournaments

# Collapse every tournament missing from the reference list
# (tournament$Tournament) into "others", using one vectorised mask.
data$tournament[!(data$tournament %in% tournament$Tournament)] <- "others"
unique(data$tournament)
##  [1] "Friendly"                            
##  [2] "British Championship"                
##  [3] "others"                              
##  [4] "Copa América"                        
##  [5] "Nordic Championship"                 
##  [6] "International Cup"                   
##  [7] "Baltic Cup"                          
##  [8] "Balkan Cup"                          
##  [9] "FIFA World Cup"                      
## [10] "FIFA World Cup qualification"        
## [11] "CCCF Championship"                   
## [12] "AFC Asian Cup qualification"         
## [13] "AFC Asian Cup"                       
## [14] "African Cup of Nations"              
## [15] "Merdeka Tournament"                  
## [16] "UEFA Euro qualification"             
## [17] "UEFA Euro"                           
## [18] "Windward Islands Tournament"         
## [19] "African Cup of Nations qualification"
## [20] "Vietnam Independence Cup"            
## [21] "UAFA Cup"                            
## [22] "South Pacific Games"                 
## [23] "King's Cup"                          
## [24] "Gulf Cup"                            
## [25] "Indonesia Tournament"                
## [26] "Korea Cup"                           
## [27] "Oceania Nations Cup"                 
## [28] "CECAFA Cup"                          
## [29] "Kirin Cup"                           
## [30] "CFU Caribbean Cup qualification"     
## [31] "CFU Caribbean Cup"                   
## [32] "Amílcar Cabral Cup"                  
## [33] "Nehru Cup"                           
## [34] "UDEAC Cup"                           
## [35] "Island Games"                        
## [36] "UNCAF Cup"                           
## [37] "Gold Cup"                            
## [38] "Confederations Cup"                  
## [39] "Oceania Nations Cup qualification"   
## [40] "SAFF Cup"                            
## [41] "AFF Championship"                    
## [42] "Cyprus International Tournament"     
## [43] "COSAFA Cup"                          
## [44] "Gold Cup qualification"              
## [45] "WAFF Championship"                   
## [46] "EAFF Championship"                   
## [47] "AFC Challenge Cup"                   
## [48] "Viva World Cup"                      
## [49] "AFC Challenge Cup qualification"     
## [50] "African Nations Championship"        
## [51] "ConIFA World Football Cup"

Removal of matches (others vs. others) and less important tournaments

# Keep a match when at least one side is a named (non-"others") team.
data <- data[data$home_team != "others" | data$away_team != "others", ]
#24564
# Keep only matches from the retained (non-"others") tournaments.
data <- data[data$tournament != "others", ]
#23911

Setting “Home” as country where Neutral is FALSE

# Matches that are not on neutral ground get the generic host label "HOME".
# as.logical() accepts both a logical column and "TRUE"/"FALSE" text, so the
# flag is read the same way the original string comparison read it.
data$country[!as.logical(data$neutral)] <- "HOME"

Binning of Country (Excluding the Home teams)

# For neutral-ground matches only, collapse host countries that are missing
# from the reference list (country$Country) into "others".
rare_neutral_host <- as.logical(data$neutral) &
  !(data$country %in% country$Country)
data$country[rare_neutral_host] <- "others"
unique(data$country)
##  [1] "HOME"                 "others"               "Chile"               
##  [4] "Philippines"          "Soviet Union"         "Finland"             
##  [7] "Tanganyika"           "Hong Kong"            "Zanzibar"            
## [10] "Singapore"            "Sudan"                "United Arab Republic"
## [13] "Korea DPR"            "India"                "Netherlands Antilles"
## [16] "Ethiopia"             "Lebanon"              "Israel"              
## [19] "Malaysia"             "Kuwait"               "Libya"               
## [22] "Congo"                "Haiti"                "Tanzania"            
## [25] "Pakistan"             "Mozambique"           "Cameroon"            
## [28] "Syria"                "Qatar"                "Liberia"             
## [31] "Mali"                 "Saudi Arabia"         "Réunion"             
## [34] "Swaziland"            "Honduras"             "United Arab Emirates"
## [37] "Oman"                 "Angola"               "Zaïre"               
## [40] "Jordan"               "Burkina Faso"         "Gabon"               
## [43] "Canada"               "Cyprus"               "South Africa"        
## [46] "Vietnam"              "El Salvador"          "Guatemala"           
## [49] "Ukraine"              "Equatorial Guinea"

Removing unnecessary columns not required for model

#Date
# Drop the columns the model does not use. Selecting by name is robust to
# column order, unlike chained positional indexes that shift after each drop.
# The remaining columns keep their original relative order.
drop_cols <- c("date", "city", "home_score", "away_score", "Year")
data <- data[, setdiff(names(data), drop_cols), drop = FALSE]
# Number of distinct values per remaining column.
vapply(data, function(column) length(unique(column)), integer(1))
##  home_team  away_team tournament    country    neutral      Month 
##         51         51         37         50          2         12 
##     Winner 
##          3
# Storage class of each remaining column.
sapply(X = data, FUN = class)
##   home_team   away_team  tournament     country     neutral       Month 
## "character" "character" "character" "character"   "logical"   "integer" 
##      Winner 
## "character"

Converting variables into factors

# Convert each modelling column to a factor, in the same order as before.
for (col_name in c("home_team", "away_team", "tournament", "country",
                   "Winner", "neutral", "Month")) {
  data[[col_name]] <- as.factor(data[[col_name]])
}

Model Creation

Splitting the data into Train and Test

# Reproducible 75/25 split on the outcome column.
set.seed(123)
split <- sample.split(data$Winner, SplitRatio = 0.75)
# TRUE entries of the mask form the training set, FALSE entries the test set.
train_set <- data[split, ]
test_set <- data[!split, ]

Random Forest

Training the model

# Fit a 500-tree random forest; column 7 (Winner) is the response and the
# remaining columns are the predictors.
set.seed(123)
classifier <- randomForest(
  x = train_set[, -7],
  y = train_set[["Winner"]],
  ntree = 500
)

Testing the trained model

# Predict on the held-out predictors and cross-tabulate the actual outcome
# (rows) against the predicted outcome (columns).
y_pred <- predict(classifier, newdata = test_set[, -7])
cm <- table(test_set[[7]], y_pred)

calculating accuracy

# Accuracy is the confusion-matrix diagonal (correct predictions per class)
# divided by the total number of test instances.
diag <- diag(cm)
n <- sum(cm)
accuracy <- sum(diag) / n
accuracy * 100
## [1] 53.98126

Naive Bayes

Training the model

# Fit a naive Bayes model on the same predictors (every column except
# column 7, Winner). This replaces the random forest held in `classifier`.
classifier <- naiveBayes(
  x = train_set[, -7],
  y = train_set[["Winner"]]
)

Testing the model

# Predicted outcome for each test row, then the confusion matrix with the
# actual outcome in rows and the prediction in columns.
y_pred <- predict(classifier, newdata = test_set[, -7])
cm <- table(test_set[[7]], y_pred)

calculating accuracy

# Share of test instances on the confusion-matrix diagonal, as a percentage.
diag <- diag(cm)
n <- sum(cm)
accuracy <- sum(diag) / n
accuracy * 100
## [1] 54.93476

ROC

# Keep the predicted labels available as a data frame.
y_pred <- as.data.frame(y_pred)

# A ROC curve needs a continuous score per class. The integer code of the
# predicted label (Away = 1, Home = 2, Tie = 3) is not a score, so request
# the per-class posterior probabilities from the fitted model instead.
class_prob <- predict(classifier, newdata = test_set[-7], type = "raw")

# One-vs-rest curves. The levels are given as (control, case) and the
# direction is fixed so that a higher probability means "more likely the case".
roc.home <- roc(
  response = ifelse(test_set$Winner == "Home", "Home", "non-Home"),
  predictor = class_prob[, "Home"],
  levels = c("non-Home", "Home"),
  direction = "<"
)
roc.away <- roc(
  response = ifelse(test_set$Winner == "Away", "Away", "non-Away"),
  predictor = class_prob[, "Away"],
  levels = c("non-Away", "Away"),
  direction = "<"
)
roc.tie <- roc(
  response = ifelse(test_set$Winner == "Tie", "Tie", "non-Tie"),
  predictor = class_prob[, "Tie"],
  levels = c("non-Tie", "Tie"),
  direction = "<"
)

plot(roc.home, col = "green", main = "ROC Curve")
lines(roc.away, col = "blue")
lines(roc.tie, col = "red")
legend("topleft", c("Home", "Away", "Tie"), fill = c("green", "blue", "red"))

AUC

 # Print the area under each one-vs-rest ROC curve (Home, Away, Tie).
 auc(roc.home)
## Area under the curve: 0.3717
 auc(roc.away)
## Area under the curve: 0.6452
 auc(roc.tie)
## Area under the curve: 0.4864
## Area under the curve: 0.4864